/*
 * Copyright (c) 2017 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/**
 * Single-precision matrix multiply kernel that produces the output matrix in
 * blocks of 4 rows by 16 columns.
 *
 * Only the declaration of the primary template is visible here; behaviour is
 * provided by explicit specialisations selected through `tail`. The <0> and
 * <1> specialisations in this file set TAIL_SIZE to `tail`, process
 * (K - tail) / 4 unrolled groups of four K steps and, for <1>, one extra
 * single K step.
 *
 * @tparam tail         Selects the specialisation (see above).
 * @param  a            Left-hand matrix; the specialisations read four
 *                      consecutive floats per row at a time.
 * @param  b            Right-hand matrix; the specialisations read 16
 *                      consecutive floats per K step.
 * @param  c            Output matrix; written by the specialisations without
 *                      being read first.
 * @param  M            Used by the specialisations as iceildiv(M, 4) to get
 *                      the number of row blocks.
 * @param  K            Length of the accumulated dimension.
 * @param  N            Used by the specialisations as iceildiv(N, 16) to get
 *                      the number of column blocks.
 * @param  a_row_stride Distance between rows of `a`, in floats (the
 *                      specialisations scale it by sizeof(float)).
 * @param  b_row_stride Distance between rows of `b`, in floats.
 * @param  c_row_stride Distance between rows of `c`, in floats.
 */
template <const unsigned int tail>
inline void sgemm_4x16_impl(
  const float* const a, const float* const b, float *c,
  const int M, const int K, const int N,
  const int a_row_stride,
  const int b_row_stride,
  const int c_row_stride
);

/**
 * 4x16 SGEMM block kernel for tail == 0 (K consumed entirely in groups of 4).
 *
 * For each 4-row by 16-column block of the output, sixteen 4-lane
 * accumulators are zeroed, the products are accumulated over K, and the block
 * is stored to `c`. `c` is never loaded, so previous contents are overwritten.
 *
 * Access pattern established by the assembly below:
 *  - `a`: four rows per block, `a_row_stride` floats apart; each row is read
 *    four consecutive floats (one 128-bit register) per group of four K steps.
 *  - `b`: for every K step 16 consecutive floats are read (offsets 0x00-0x30),
 *    after which the pointer advances by `b_row_stride` floats.
 *  - `c`: 16 consecutive floats are stored per row, rows `c_row_stride`
 *    floats apart.
 *
 * K handling: k = K / 4 groups. The loop at label 1 runs k - 1 times and the
 * code at label 2 performs the final group and the stores.
 * NOTE(review): K appears to be required to be a positive multiple of 4.
 * With k == 0 the initial `subs` yields a non-zero result, so `beq 2f` is not
 * taken and the loop is entered; any K % 4 remainder is dropped by the
 * integer division.
 * NOTE(review): every block is read and written in full (4 rows, 16 columns).
 * If iceildiv rounds up, as its name suggests, callers presumably must supply
 * buffers padded to whole blocks - confirm against the callers.
 *
 * @param a            Left-hand matrix.
 * @param b            Right-hand matrix.
 * @param c            Output matrix (overwritten).
 * @param M            Row count; iceildiv(M, 4) row blocks are processed.
 * @param K            Accumulated dimension (see the note above).
 * @param N            Column count; iceildiv(N, 16) column blocks are processed.
 * @param a_row_stride Row stride of `a` in floats.
 * @param b_row_stride Row stride of `b` in floats.
 * @param c_row_stride Row stride of `c` in floats.
 */
template <>
inline void sgemm_4x16_impl<0>(
  const float* const a, const float* const b, float *c,
  const int M, const int K, const int N,
  const int a_row_stride,
  const int b_row_stride,
  const int c_row_stride
) {
  const int TAIL_SIZE = 0;
  const int M_BLOCK = 4;
  const int N_BLOCK = 16;

  // Number of 4-row and 16-column blocks (iceildiv is defined elsewhere).
  const int m_blocks = iceildiv(M, M_BLOCK);
  const int n_blocks = iceildiv(N, N_BLOCK);

  // For each block of output rows
  for (int mblock = 0; mblock < m_blocks; mblock++) {
    // For each block of output columns
    for (int nblock = 0; nblock < n_blocks; nblock++) {
      // Start of the four A rows, the 16 B columns and the C block handled
      // by this iteration. The assembly advances all three pointers.
      const float *aptr = a + mblock*M_BLOCK*a_row_stride;
      const float *bptr = b + nblock*N_BLOCK;
      float *cptr = c + mblock*M_BLOCK*c_row_stride + nblock*N_BLOCK;
      // Number of groups of four K steps, including the one done at label 2.
      // NOTE(review): k is a 32-bit int but is accessed through its 64-bit
      // register name (%x[k]); confirm the upper 32 bits are well defined on
      // the supported toolchains.
      int k = (K - TAIL_SIZE) / 4;

      asm volatile(
        // Register aliases.
        //  aptr2..aptr4 : pointers to A rows 2-4 (row 1 uses %x[aptr]).
        //  vCij / qCij  : accumulator for C row i, column group j (4 floats).
        //  vAi / qAi    : four consecutive K values of A row i.
        //  vBj / qBj    : column group j (4 floats) of the current B row.
        "aptr2 .req X20\n"
        "aptr3 .req X21\n"
        "aptr4 .req X22\n"
        "vC11 .req  v0\n" "vC12 .req  v1\n" "vC13 .req  v2\n" "vC14 .req  v3\n"
        "qC11 .req  q0\n" "qC12 .req  q1\n" "qC13 .req  q2\n" "qC14 .req  q3\n"
        "vC21 .req  v4\n" "vC22 .req  v5\n" "vC23 .req  v6\n" "vC24 .req  v7\n"
        "qC21 .req  q4\n" "qC22 .req  q5\n" "qC23 .req  q6\n" "qC24 .req  q7\n"
        "vC31 .req  v8\n" "vC32 .req  v9\n" "vC33 .req v10\n" "vC34 .req v11\n"
        "qC31 .req  q8\n" "qC32 .req  q9\n" "qC33 .req q10\n" "qC34 .req q11\n"
        "vC41 .req v12\n" "vC42 .req v13\n" "vC43 .req v14\n" "vC44 .req v15\n"
        "qC41 .req q12\n" "qC42 .req q13\n" "qC43 .req q14\n" "qC44 .req q15\n"
        "vA1 .req v16\n" "qA1 .req q16\n" "dA1 .req d16\n" "sA1 .req s16\n"
        "vA2 .req v17\n" "qA2 .req q17\n" "dA2 .req d17\n" "sA2 .req s17\n"
        "vA3 .req v18\n" "qA3 .req q18\n" "dA3 .req d18\n" "sA3 .req s18\n"
        "vA4 .req v19\n" "qA4 .req q19\n" "dA4 .req d19\n" "sA4 .req s19\n"
        "vB1 .req v20\n" "qB1 .req q20\n"
        "vB2 .req v21\n" "qB2 .req q21\n"
        "vB3 .req v22\n" "qB3 .req q22\n"
        "vB4 .req v23\n" "qB4 .req q23\n"

        // Clear accumulators, initialise pointers
        // A rows 1-2 and the first three column groups of the first B row are
        // preloaded here; A rows 3-4 and B group 4 are loaded inside the loop
        // or tail.
        "movi vC11.4s, #0\n"
        "add aptr2, %x[aptr], %x[a_row_stride_bytes]\n"
        "movi vC12.4s, #0\n"
        "add aptr3,    aptr2, %x[a_row_stride_bytes]\n"
        "movi vC13.4s, #0\n"
        "add aptr4,    aptr3, %x[a_row_stride_bytes]\n"
        "movi vC14.4s, #0\n"
        "ldr qA1, [%x[aptr]], #0x10\n"
        "movi vC21.4s, #0\n"
        "ldr qA2, [   aptr2], #0x10\n"
        "movi vC22.4s, #0\n"
        "ldr qB1, [%x[bptr], #0x00]\n"
        "movi vC23.4s, #0\n"
        "ldr qB2, [%x[bptr], #0x10]\n"
        "movi vC24.4s, #0\n"
        "ldr qB3, [%x[bptr], #0x20]\n"
        "movi vC31.4s, #0\n"
        "movi vC32.4s, #0\n"
        "movi vC33.4s, #0\n"
        "movi vC34.4s, #0\n"
        "movi vC41.4s, #0\n"
        "movi vC42.4s, #0\n"
        "movi vC43.4s, #0\n"
        "movi vC44.4s, #0\n"
        // Reserve the last group for the tail; skip the loop if it is the
        // only one.
        "subs %x[k], %x[k], #1\n"
        "beq 2f\n"

        "1:"  // Loop proper
          // K step 0 of the group: lane 0 of each A register. A rows 3-4 and
          // B group 4 are loaded here, then bptr moves to the next B row and
          // its groups 1-3 are loaded between the multiply-accumulates.
          "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
          "ldr qA3, [   aptr3], #0x10\n"
          "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
          "ldr qA4, [   aptr4], #0x10\n"
          "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
          "ldr qB1, [%x[bptr], #0x00]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
          "ldr qB2, [%x[bptr], #0x10]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
          "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
          "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
          "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
          "ldr qB3, [%x[bptr], #0x20]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[0]\n"

          // K step 1 of the group: lane 1 of each A register.
          "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
          "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
          "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
          "ldr qB1, [%x[bptr], #0x00]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
          "ldr qB2, [%x[bptr], #0x10]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
          "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
          "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
          "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
          "ldr qB3, [%x[bptr], #0x20]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[1]\n"

          // K step 2 of the group: lane 2 of each A register.
          "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
          "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
          "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
          "ldr qB1, [%x[bptr], #0x00]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
          "ldr qB2, [%x[bptr], #0x10]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
          "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
          "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
          "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
          "ldr qB3, [%x[bptr], #0x20]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[2]\n"

          // K step 3 of the group: lane 3 of each A register. The group
          // counter is decremented and A rows 1-2 plus B groups 1-3 for the
          // next group are loaded before branching back.
          "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
          "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
          "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
          "ldr qB1, [%x[bptr], #0x00]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
          "ldr qB2, [%x[bptr], #0x10]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
          "subs %x[k], %x[k], #1\n"
          "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
          "ldr qA1, [%x[aptr]], #0x10\n"
          "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
          "ldr qA2, [   aptr2], #0x10\n"
          "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
          "ldr qB3, [%x[bptr], #0x20]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[3]\n"
          "bne 1b\n"

        "2:"  // Tail
          // Final group of four K steps. Steps 0-2 mirror the loop body.
          "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
          "ldr qA3, [   aptr3], #0x10\n"
          "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
          "ldr qA4, [   aptr4], #0x10\n"
          "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
          "ldr qB1, [%x[bptr], #0x00]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
          "ldr qB2, [%x[bptr], #0x10]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
          "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
          "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
          "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
          "ldr qB3, [%x[bptr], #0x20]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[0]\n"

          "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
          "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
          "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
          "ldr qB1, [%x[bptr], #0x00]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
          "ldr qB2, [%x[bptr], #0x10]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
          "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
          "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
          "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
          "ldr qB3, [%x[bptr], #0x20]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[1]\n"

          "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
          "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
          "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
          "ldr qB1, [%x[bptr], #0x00]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
          "ldr qB2, [%x[bptr], #0x10]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
          "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
          "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
          "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
          "ldr qB3, [%x[bptr], #0x20]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[2]\n"

          // Last K step (lane 3), done one C row at a time so each row can be
          // stored as soon as its four accumulators are final. cptr advances
          // by one row of C after each row is written.
          "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
          "stp qC11, qC12, [%x[cptr], #0x00]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
          "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
          "stp qC13, qC14, [%x[cptr], #0x20]\n"
          "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
          "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
          "stp qC21, qC22, [%x[cptr], #0x00]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
          "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
          "stp qC23, qC24, [%x[cptr], #0x20]\n"
          "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
          "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
          "stp qC31, qC32, [%x[cptr], #0x00]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
          "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
          "stp qC33, qC34, [%x[cptr], #0x20]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
          "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
          "stp qC41, qC42, [%x[cptr], #0x00]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[3]\n"
          "stp qC43, qC44, [%x[cptr], #0x20]\n"
          "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"

        // Release the register aliases so they can be defined again by the
        // next asm statement.
        ".unreq vB4\n" ".unreq qB4\n"
        ".unreq vB3\n" ".unreq qB3\n"
        ".unreq vB2\n" ".unreq qB2\n"
        ".unreq vB1\n" ".unreq qB1\n"
        ".unreq vA4\n" ".unreq qA4\n" ".unreq dA4\n" ".unreq sA4\n"
        ".unreq vA3\n" ".unreq qA3\n" ".unreq dA3\n" ".unreq sA3\n"
        ".unreq vA2\n" ".unreq qA2\n" ".unreq dA2\n" ".unreq sA2\n"
        ".unreq vA1\n" ".unreq qA1\n" ".unreq dA1\n" ".unreq sA1\n"
        ".unreq qC41\n" ".unreq qC42\n" ".unreq qC43\n" ".unreq qC44\n"
        ".unreq vC41\n" ".unreq vC42\n" ".unreq vC43\n" ".unreq vC44\n"
        ".unreq qC31\n" ".unreq qC32\n" ".unreq qC33\n" ".unreq qC34\n"
        ".unreq vC31\n" ".unreq vC32\n" ".unreq vC33\n" ".unreq vC34\n"
        ".unreq qC21\n" ".unreq qC22\n" ".unreq qC23\n" ".unreq qC24\n"
        ".unreq vC21\n" ".unreq vC22\n" ".unreq vC23\n" ".unreq vC24\n"
        ".unreq qC11\n" ".unreq qC12\n" ".unreq qC13\n" ".unreq qC14\n"
        ".unreq vC11\n" ".unreq vC12\n" ".unreq vC13\n" ".unreq vC14\n"
        ".unreq aptr2\n"
        ".unreq aptr3\n"
        ".unreq aptr4\n"

        // Read-write operands: the three matrix pointers and the group
        // counter are all modified by the assembly.
        : [aptr] "+r" (aptr),
          [bptr] "+r" (bptr),
          [cptr] "+r" (cptr),
          [k] "+r" (k)
        // Inputs: row strides converted from floats to bytes.
        : [a_row_stride_bytes] "r" (a_row_stride * sizeof(float)),
          [b_row_stride_bytes] "r" (b_row_stride * sizeof(float)),
          [c_row_stride_bytes] "r" (c_row_stride * sizeof(float))
        // Clobbers: flags (subs), memory (stores to C), the three scratch
        // pointers and every vector register aliased above.
        : "cc", "memory", "x20", "x21", "x22",
          "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
          "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
          "v21", "v22", "v23"
      );
    }
  }
}

/**
 * 4x16 SGEMM block kernel for tail == 1 (groups of four K steps followed by
 * one single K step).
 *
 * For each 4-row by 16-column block of the output, sixteen 4-lane
 * accumulators are zeroed, the products are accumulated over K, and the block
 * is stored to `c`. `c` is never loaded, so previous contents are overwritten.
 *
 * Control flow inside the assembly, with k = (K - 1) / 4:
 *  - k == 0: only the single K step at label 2 is executed.
 *  - k >= 1: label 3 preloads A, label 1 loops k - 1 times over full groups,
 *    label 4 performs the last full group and loads one float per A row, and
 *    label 2 performs the remaining K step and the stores.
 * NOTE(review): K is presumably expected to satisfy K % 4 == 1; other values
 * are truncated by the integer division - confirm against the callers.
 * NOTE(review): every block is read and written in full (4 rows, 16 columns).
 * If iceildiv rounds up, as its name suggests, callers presumably must supply
 * buffers padded to whole blocks - confirm against the callers.
 *
 * @param a            Left-hand matrix.
 * @param b            Right-hand matrix.
 * @param c            Output matrix (overwritten).
 * @param M            Row count; iceildiv(M, 4) row blocks are processed.
 * @param K            Accumulated dimension (see the note above).
 * @param N            Column count; iceildiv(N, 16) column blocks are processed.
 * @param a_row_stride Row stride of `a` in floats.
 * @param b_row_stride Row stride of `b` in floats.
 * @param c_row_stride Row stride of `c` in floats.
 */
template <>
inline void sgemm_4x16_impl<1>(
  const float* const a, const float* const b, float *c,
  const int M, const int K, const int N,
  const int a_row_stride,
  const int b_row_stride,
  const int c_row_stride
) {
  const int TAIL_SIZE = 1;
  const int M_BLOCK = 4;
  const int N_BLOCK = 16;

  // Number of 4-row and 16-column blocks (iceildiv is defined elsewhere).
  const int m_blocks = iceildiv(M, M_BLOCK);
  const int n_blocks = iceildiv(N, N_BLOCK);

  // For each block of output rows
  for (int mblock = 0; mblock < m_blocks; mblock++) {
    // For each block of output columns
    for (int nblock = 0; nblock < n_blocks; nblock++) {
      // Start of the four A rows, the 16 B columns and the C block handled
      // by this iteration. The assembly advances all three pointers.
      const float *aptr = a + mblock*M_BLOCK*a_row_stride;
      const float *bptr = b + nblock*N_BLOCK;
      float *cptr = c + mblock*M_BLOCK*c_row_stride + nblock*N_BLOCK;
      // Number of full groups of four K steps preceding the single tail step.
      // NOTE(review): k is a 32-bit int but is accessed through its 64-bit
      // register name (%x[k]); confirm the upper 32 bits are well defined on
      // the supported toolchains.
      int k = (K - TAIL_SIZE) / 4;

      asm volatile(
        // Register aliases.
        //  aptr2..aptr4 : pointers to A rows 2-4 (row 1 uses %x[aptr]).
        //  vCij / qCij  : accumulator for C row i, column group j (4 floats).
        //  vAi / qAi    : four consecutive K values of A row i; sAi is the
        //                 single-float view used for the tail step.
        //  vBj / qBj    : column group j (4 floats) of the current B row.
        "aptr2 .req X20\n"
        "aptr3 .req X21\n"
        "aptr4 .req X22\n"
        "vC11 .req  v0\n" "vC12 .req  v1\n" "vC13 .req  v2\n" "vC14 .req  v3\n"
        "qC11 .req  q0\n" "qC12 .req  q1\n" "qC13 .req  q2\n" "qC14 .req  q3\n"
        "vC21 .req  v4\n" "vC22 .req  v5\n" "vC23 .req  v6\n" "vC24 .req  v7\n"
        "qC21 .req  q4\n" "qC22 .req  q5\n" "qC23 .req  q6\n" "qC24 .req  q7\n"
        "vC31 .req  v8\n" "vC32 .req  v9\n" "vC33 .req v10\n" "vC34 .req v11\n"
        "qC31 .req  q8\n" "qC32 .req  q9\n" "qC33 .req q10\n" "qC34 .req q11\n"
        "vC41 .req v12\n" "vC42 .req v13\n" "vC43 .req v14\n" "vC44 .req v15\n"
        "qC41 .req q12\n" "qC42 .req q13\n" "qC43 .req q14\n" "qC44 .req q15\n"
        "vA1 .req v16\n" "qA1 .req q16\n" "dA1 .req d16\n" "sA1 .req s16\n"
        "vA2 .req v17\n" "qA2 .req q17\n" "dA2 .req d17\n" "sA2 .req s17\n"
        "vA3 .req v18\n" "qA3 .req q18\n" "dA3 .req d18\n" "sA3 .req s18\n"
        "vA4 .req v19\n" "qA4 .req q19\n" "dA4 .req d19\n" "sA4 .req s19\n"
        "vB1 .req v20\n" "qB1 .req q20\n"
        "vB2 .req v21\n" "qB2 .req q21\n"
        "vB3 .req v22\n" "qB3 .req q22\n"
        "vB4 .req v23\n" "qB4 .req q23\n"

        // Clear accumulators, initialise pointers
        // The first three column groups of the first B row are preloaded for
        // both paths; the branch then selects how A is loaded.
        "movi vC11.4s, #0\n"
        "ldr qB1, [%x[bptr], #0x00]\n"
        "movi vC12.4s, #0\n"
        "ldr qB2, [%x[bptr], #0x10]\n"
        "movi vC13.4s, #0\n"
        "ldr qB3, [%x[bptr], #0x20]\n"
        "movi vC14.4s, #0\n"
        "add aptr2, %x[aptr], %x[a_row_stride_bytes]\n"
        "movi vC21.4s, #0\n"
        "add aptr3,    aptr2, %x[a_row_stride_bytes]\n"
        "movi vC22.4s, #0\n"
        "add aptr4,    aptr3, %x[a_row_stride_bytes]\n"
        "movi vC23.4s, #0\n"
        // At least one full group of four: take the looping path.
        "cbnz %x[k], 3f\n"

        // Prepare for tail in K
        // No full group: load a single float from A rows 1-2 and clear the
        // remaining accumulators.
        "movi vC24.4s, #0\n"
        "ldr sA1, [%x[aptr]], #0x04\n"
        "movi vC31.4s, #0\n"
        "ldr sA2, [   aptr2], #0x04\n"
        "movi vC32.4s, #0\n"
        "movi vC33.4s, #0\n"
        "movi vC34.4s, #0\n"
        "movi vC41.4s, #0\n"
        "movi vC42.4s, #0\n"
        "movi vC43.4s, #0\n"
        "movi vC44.4s, #0\n"
        "b 2f\n"  // Jump to tail

        "3:"  // Prepare for loop over K
          // Load four floats from A rows 1-2 and clear the remaining
          // accumulators; the last full group is reserved for label 4.
          "movi vC24.4s, #0\n"
          "ldr qA1, [%x[aptr]], #0x10\n"
          "movi vC31.4s, #0\n"
          "ldr qA2, [   aptr2], #0x10\n"
          "movi vC32.4s, #0\n"
          "movi vC33.4s, #0\n"
          "movi vC34.4s, #0\n"
          "movi vC41.4s, #0\n"
          "movi vC42.4s, #0\n"
          "movi vC43.4s, #0\n"
          "movi vC44.4s, #0\n"
          "subs %x[k], %x[k], #1\n"
          "beq 4f\n"

        "1:"  // Loop proper
          // K step 0 of the group: lane 0 of each A register. A rows 3-4 and
          // B group 4 are loaded here, then bptr moves to the next B row and
          // its groups 1-3 are loaded between the multiply-accumulates.
          "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
          "ldr qA3, [   aptr3], #0x10\n"
          "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
          "ldr qA4, [   aptr4], #0x10\n"
          "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
          "ldr qB1, [%x[bptr], #0x00]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
          "ldr qB2, [%x[bptr], #0x10]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
          "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
          "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
          "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
          "ldr qB3, [%x[bptr], #0x20]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[0]\n"

          // K step 1 of the group: lane 1 of each A register.
          "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
          "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
          "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
          "ldr qB1, [%x[bptr], #0x00]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
          "ldr qB2, [%x[bptr], #0x10]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
          "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
          "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
          "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
          "ldr qB3, [%x[bptr], #0x20]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[1]\n"

          // K step 2 of the group: lane 2 of each A register.
          "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
          "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
          "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
          "ldr qB1, [%x[bptr], #0x00]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
          "ldr qB2, [%x[bptr], #0x10]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
          "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
          "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
          "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
          "ldr qB3, [%x[bptr], #0x20]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[2]\n"

          // K step 3 of the group: lane 3 of each A register. The group
          // counter is decremented and A rows 1-2 plus B groups 1-3 for the
          // next group are loaded before branching back.
          "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
          "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
          "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
          "ldr qB1, [%x[bptr], #0x00]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
          "ldr qB2, [%x[bptr], #0x10]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
          "subs %x[k], %x[k], #1\n"
          "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
          "ldr qA1, [%x[aptr]], #0x10\n"
          "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
          "ldr qA2, [   aptr2], #0x10\n"
          "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
          "ldr qB3, [%x[bptr], #0x20]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[3]\n"
          "bne 1b\n"

        "4:"  // Tail iteration
          // Last full group of four K steps. Identical to the loop body
          // except that its final step loads a single float from A rows 1-2
          // for the common tail instead of a full register.
          "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
          "ldr qA3, [   aptr3], #0x10\n"
          "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
          "ldr qA4, [   aptr4], #0x10\n"
          "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
          "ldr qB1, [%x[bptr], #0x00]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
          "ldr qB2, [%x[bptr], #0x10]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
          "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
          "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
          "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
          "ldr qB3, [%x[bptr], #0x20]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[0]\n"

          "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
          "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
          "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
          "ldr qB1, [%x[bptr], #0x00]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
          "ldr qB2, [%x[bptr], #0x10]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
          "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
          "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
          "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
          "ldr qB3, [%x[bptr], #0x20]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[1]\n"

          "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
          "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
          "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
          "ldr qB1, [%x[bptr], #0x00]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
          "ldr qB2, [%x[bptr], #0x10]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
          "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
          "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
          "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
          "ldr qB3, [%x[bptr], #0x20]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[2]\n"

          "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
          "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
          "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
          "ldr qB1, [%x[bptr], #0x00]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
          "ldr qB2, [%x[bptr], #0x10]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
          "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
          "ldr sA1, [%x[aptr]], #0x04\n"
          "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
          "ldr sA2, [   aptr2], #0x04\n"
          "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
          "ldr qB3, [%x[bptr], #0x20]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[3]\n"

        "2:"  // Common tail
          // Single remaining K step (lane 0 of each A register), done one C
          // row at a time so each row can be stored as soon as its four
          // accumulators are final. The single floats of A rows 3-4 and B
          // group 4 are loaded here, ahead of their first use.
          "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
          "stp qC11, qC12, [%x[cptr], #0x00]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
          "ldr sA3, [   aptr3], #0x04\n"
          "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
          "stp qC13, qC14, [%x[cptr], #0x20]\n"
          "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
          "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
          "stp qC21, qC22, [%x[cptr], #0x00]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
          "ldr sA4, [   aptr4], #0x04\n"
          "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
          "stp qC23, qC24, [%x[cptr], #0x20]\n"
          "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
          "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
          "stp qC31, qC32, [%x[cptr], #0x00]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
          "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
          "stp qC33, qC34, [%x[cptr], #0x20]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
          "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
          "stp qC41, qC42, [%x[cptr], #0x00]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
          "stp qC43, qC44, [%x[cptr], #0x20]\n"
          "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"

        // Release the register aliases so they can be defined again by the
        // next asm statement.
        ".unreq vB4\n" ".unreq qB4\n"
        ".unreq vB3\n" ".unreq qB3\n"
        ".unreq vB2\n" ".unreq qB2\n"
        ".unreq vB1\n" ".unreq qB1\n"
        ".unreq vA4\n" ".unreq qA4\n" ".unreq dA4\n" ".unreq sA4\n"
        ".unreq vA3\n" ".unreq qA3\n" ".unreq dA3\n" ".unreq sA3\n"
        ".unreq vA2\n" ".unreq qA2\n" ".unreq dA2\n" ".unreq sA2\n"
        ".unreq vA1\n" ".unreq qA1\n" ".unreq dA1\n" ".unreq sA1\n"
        ".unreq qC41\n" ".unreq qC42\n" ".unreq qC43\n" ".unreq qC44\n"
        ".unreq vC41\n" ".unreq vC42\n" ".unreq vC43\n" ".unreq vC44\n"
        ".unreq qC31\n" ".unreq qC32\n" ".unreq qC33\n" ".unreq qC34\n"
        ".unreq vC31\n" ".unreq vC32\n" ".unreq vC33\n" ".unreq vC34\n"
        ".unreq qC21\n" ".unreq qC22\n" ".unreq qC23\n" ".unreq qC24\n"
        ".unreq vC21\n" ".unreq vC22\n" ".unreq vC23\n" ".unreq vC24\n"
        ".unreq qC11\n" ".unreq qC12\n" ".unreq qC13\n" ".unreq qC14\n"
        ".unreq vC11\n" ".unreq vC12\n" ".unreq vC13\n" ".unreq vC14\n"
        ".unreq aptr2\n"
        ".unreq aptr3\n"
        ".unreq aptr4\n"

        // Read-write operands: the three matrix pointers and the group
        // counter are all modified by the assembly.
        : [aptr] "+r" (aptr),
          [bptr] "+r" (bptr),
          [cptr] "+r" (cptr),
          [k] "+r" (k)
        // Inputs: row strides converted from floats to bytes.
        : [a_row_stride_bytes] "r" (a_row_stride * sizeof(float)),
          [b_row_stride_bytes] "r" (b_row_stride * sizeof(float)),
          [c_row_stride_bytes] "r" (c_row_stride * sizeof(float))
        // Clobbers: flags (subs), memory (stores to C), the three scratch
        // pointers and every vector register aliased above.
        : "cc", "memory", "x20", "x21", "x22",
          "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
          "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
          "v21", "v22", "v23"
      );
    }
  }
}

/**
 * Single-precision matrix multiply C = A * B, computed in 4x16 output tiles,
 * specialised for a K dimension that leaves a remainder of two after being
 * processed in groups of four.
 *
 * For every tile the sixteen accumulator registers are zeroed, (K - 2) / 4
 * groups of four K steps are accumulated, two further K steps are accumulated
 * in the common tail, and the tile is then stored to C (C is overwritten, not
 * accumulated into).
 *
 * @param a             Pointer to the first element of A.
 * @param b             Pointer to the first element of B.
 * @param c             Pointer to the first element of C.
 * @param M             Number of output rows; rounded up to blocks of 4.
 * @param K             Depth of the product.
 * @param N             Number of output columns; rounded up to blocks of 16.
 * @param a_row_stride  Distance between rows of A, in floats.
 * @param b_row_stride  Distance between rows of B, in floats.
 * @param c_row_stride  Distance between rows of C, in floats.
 *
 * NOTE(review): a full 4x16 tile is always loaded and stored, with no bounds
 * checks, and the tail always consumes two K steps. Presumably callers pad the
 * matrices accordingly and only select this specialisation when K % 4 == 2 --
 * confirm against the dispatching code.
 * NOTE(review): iceildiv is defined elsewhere; presumably a ceiling division.
 */
template <>
inline void sgemm_4x16_impl<2>(
  const float* const a, const float* const b, float *c,
  const int M, const int K, const int N,
  const int a_row_stride,
  const int b_row_stride,
  const int c_row_stride
) {
  const int TAIL_SIZE = 2;
  const int M_BLOCK = 4;
  const int N_BLOCK = 16;

  const int m_blocks = iceildiv(M, M_BLOCK);
  const int n_blocks = iceildiv(N, N_BLOCK);

  // For each block of output rows
  for (int mblock = 0; mblock < m_blocks; mblock++) {
    // For each block of output columns
    for (int nblock = 0; nblock < n_blocks; nblock++) {
      const float *aptr = a + mblock*M_BLOCK*a_row_stride;
      const float *bptr = b + nblock*N_BLOCK;
      float *cptr = c + mblock*M_BLOCK*c_row_stride + nblock*N_BLOCK;
      // Number of groups of four K steps. Held in a 64-bit variable because
      // the assembly below tests and decrements it through its 64-bit register
      // name (%x[k]); with a 32-bit operand the upper half of that register
      // would not be guaranteed to be defined.
      long long k = (K - TAIL_SIZE) / 4;

      asm volatile(
        // Register aliases: aptr2..4 address rows 2..4 of the A block,
        // C<row><col> are the 4x4 grid of accumulator vectors, A<row> holds
        // values from one row of A and B<col> one vector of a row of B.
        "aptr2 .req X20\n"
        "aptr3 .req X21\n"
        "aptr4 .req X22\n"
        "vC11 .req  v0\n" "vC12 .req  v1\n" "vC13 .req  v2\n" "vC14 .req  v3\n"
        "qC11 .req  q0\n" "qC12 .req  q1\n" "qC13 .req  q2\n" "qC14 .req  q3\n"
        "vC21 .req  v4\n" "vC22 .req  v5\n" "vC23 .req  v6\n" "vC24 .req  v7\n"
        "qC21 .req  q4\n" "qC22 .req  q5\n" "qC23 .req  q6\n" "qC24 .req  q7\n"
        "vC31 .req  v8\n" "vC32 .req  v9\n" "vC33 .req v10\n" "vC34 .req v11\n"
        "qC31 .req  q8\n" "qC32 .req  q9\n" "qC33 .req q10\n" "qC34 .req q11\n"
        "vC41 .req v12\n" "vC42 .req v13\n" "vC43 .req v14\n" "vC44 .req v15\n"
        "qC41 .req q12\n" "qC42 .req q13\n" "qC43 .req q14\n" "qC44 .req q15\n"
        "vA1 .req v16\n" "qA1 .req q16\n" "dA1 .req d16\n" "sA1 .req s16\n"
        "vA2 .req v17\n" "qA2 .req q17\n" "dA2 .req d17\n" "sA2 .req s17\n"
        "vA3 .req v18\n" "qA3 .req q18\n" "dA3 .req d18\n" "sA3 .req s18\n"
        "vA4 .req v19\n" "qA4 .req q19\n" "dA4 .req d19\n" "sA4 .req s19\n"
        "vB1 .req v20\n" "qB1 .req q20\n"
        "vB2 .req v21\n" "qB2 .req q21\n"
        "vB3 .req v22\n" "qB3 .req q22\n"
        "vB4 .req v23\n" "qB4 .req q23\n"

        // Clear accumulators, initialise pointers
        "movi vC11.4s, #0\n"
        "ldr qB1, [%x[bptr], #0x00]\n"
        "movi vC12.4s, #0\n"
        "ldr qB2, [%x[bptr], #0x10]\n"
        "movi vC13.4s, #0\n"
        "ldr qB3, [%x[bptr], #0x20]\n"
        "movi vC14.4s, #0\n"
        "add aptr2, %x[aptr], %x[a_row_stride_bytes]\n"
        "movi vC21.4s, #0\n"
        "add aptr3,    aptr2, %x[a_row_stride_bytes]\n"
        "movi vC22.4s, #0\n"
        "add aptr4,    aptr3, %x[a_row_stride_bytes]\n"
        "movi vC23.4s, #0\n"
        "cbnz %x[k], 3f\n"

        // Prepare for tail in K
        // (k == 0: only the two tail values of each A row are needed, so the
        // first two rows are fetched with 64-bit loads.)
        "movi vC24.4s, #0\n"
        "ldr dA1, [%x[aptr]], #0x08\n"
        "movi vC31.4s, #0\n"
        "ldr dA2, [   aptr2], #0x08\n"
        "movi vC32.4s, #0\n"
        "movi vC33.4s, #0\n"
        "movi vC34.4s, #0\n"
        "movi vC41.4s, #0\n"
        "movi vC42.4s, #0\n"
        "movi vC43.4s, #0\n"
        "movi vC44.4s, #0\n"
        "b 2f\n"  // Jump to tail

        "3:"  // Prepare for loop over K
          "movi vC24.4s, #0\n"
          "ldr qA1, [%x[aptr]], #0x10\n"
          "movi vC31.4s, #0\n"
          "ldr qA2, [   aptr2], #0x10\n"
          "movi vC32.4s, #0\n"
          "movi vC33.4s, #0\n"
          "movi vC34.4s, #0\n"
          "movi vC41.4s, #0\n"
          "movi vC42.4s, #0\n"
          "movi vC43.4s, #0\n"
          "movi vC44.4s, #0\n"
          // The last group of four is handled at label 4, so the loop proper
          // runs k - 1 times.
          "subs %x[k], %x[k], #1\n"
          "beq 4f\n"

        "1:"  // Loop proper
          // K step using lane 0 of each A register
          "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
          "ldr qA3, [   aptr3], #0x10\n"
          "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
          "ldr qA4, [   aptr4], #0x10\n"
          "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
          "ldr qB1, [%x[bptr], #0x00]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
          "ldr qB2, [%x[bptr], #0x10]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
          "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
          "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
          "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
          "ldr qB3, [%x[bptr], #0x20]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[0]\n"

          // K step using lane 1
          "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
          "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
          "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
          "ldr qB1, [%x[bptr], #0x00]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
          "ldr qB2, [%x[bptr], #0x10]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
          "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
          "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
          "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
          "ldr qB3, [%x[bptr], #0x20]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[1]\n"

          // K step using lane 2
          "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
          "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
          "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
          "ldr qB1, [%x[bptr], #0x00]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
          "ldr qB2, [%x[bptr], #0x10]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
          "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
          "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
          "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
          "ldr qB3, [%x[bptr], #0x20]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[2]\n"

          // K step using lane 3; also reloads A rows 1 and 2 for the next
          // group of four and updates the loop counter.
          "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
          "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
          "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
          "ldr qB1, [%x[bptr], #0x00]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
          "ldr qB2, [%x[bptr], #0x10]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
          "subs %x[k], %x[k], #1\n"
          "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
          "ldr qA1, [%x[aptr]], #0x10\n"
          "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
          "ldr qA2, [   aptr2], #0x10\n"
          "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
          "ldr qB3, [%x[bptr], #0x20]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[3]\n"
          "bne 1b\n"

        "4:"  // Tail iteration
          // Final group of four K steps. Identical to the loop body except
          // that the A reloads at the end are 64-bit loads feeding the common
          // tail.
          "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
          "ldr qA3, [   aptr3], #0x10\n"
          "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
          "ldr qA4, [   aptr4], #0x10\n"
          "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
          "ldr qB1, [%x[bptr], #0x00]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
          "ldr qB2, [%x[bptr], #0x10]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
          "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
          "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
          "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
          "ldr qB3, [%x[bptr], #0x20]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[0]\n"

          "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
          "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
          "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
          "ldr qB1, [%x[bptr], #0x00]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
          "ldr qB2, [%x[bptr], #0x10]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
          "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
          "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
          "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
          "ldr qB3, [%x[bptr], #0x20]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[1]\n"

          "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
          "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
          "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
          "ldr qB1, [%x[bptr], #0x00]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
          "ldr qB2, [%x[bptr], #0x10]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
          "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
          "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
          "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
          "ldr qB3, [%x[bptr], #0x20]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[2]\n"

          "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
          "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
          "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
          "ldr qB1, [%x[bptr], #0x00]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
          "ldr qB2, [%x[bptr], #0x10]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
          "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
          "ldr dA1, [%x[aptr]], #0x08\n"
          "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
          "ldr dA2, [   aptr2], #0x08\n"
          "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
          "ldr qB3, [%x[bptr], #0x20]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[3]\n"

        "2:"  // Common tail
          // First of the two remaining K steps (lane 0 of the 64-bit loads)
          "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
          "ldr dA3, [   aptr3], #0x08\n"
          "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
          "ldr dA4, [   aptr4], #0x08\n"
          "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
          "ldr qB1, [%x[bptr], #0x00]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
          "ldr qB2, [%x[bptr], #0x10]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
          "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
          "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
          "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
          "ldr qB3, [%x[bptr], #0x20]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[0]\n"

          // Last K step (lane 1), ordered row by row so that each output row
          // is stored as soon as its four accumulators are complete.
          "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
          "stp qC11, qC12, [%x[cptr], #0x00]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
          "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
          "stp qC13, qC14, [%x[cptr], #0x20]\n"
          "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
          "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
          "stp qC21, qC22, [%x[cptr], #0x00]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
          "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
          "stp qC23, qC24, [%x[cptr], #0x20]\n"
          "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
          "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
          "stp qC31, qC32, [%x[cptr], #0x00]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
          "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
          "stp qC33, qC34, [%x[cptr], #0x20]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
          "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
          "stp qC41, qC42, [%x[cptr], #0x00]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
          "stp qC43, qC44, [%x[cptr], #0x20]\n"
          "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"

        // Release every alias so the names can be defined again by the next
        // asm statement.
        ".unreq vB4\n" ".unreq qB4\n"
        ".unreq vB3\n" ".unreq qB3\n"
        ".unreq vB2\n" ".unreq qB2\n"
        ".unreq vB1\n" ".unreq qB1\n"
        ".unreq vA4\n" ".unreq qA4\n" ".unreq dA4\n" ".unreq sA4\n"
        ".unreq vA3\n" ".unreq qA3\n" ".unreq dA3\n" ".unreq sA3\n"
        ".unreq vA2\n" ".unreq qA2\n" ".unreq dA2\n" ".unreq sA2\n"
        ".unreq vA1\n" ".unreq qA1\n" ".unreq dA1\n" ".unreq sA1\n"
        ".unreq qC41\n" ".unreq qC42\n" ".unreq qC43\n" ".unreq qC44\n"
        ".unreq vC41\n" ".unreq vC42\n" ".unreq vC43\n" ".unreq vC44\n"
        ".unreq qC31\n" ".unreq qC32\n" ".unreq qC33\n" ".unreq qC34\n"
        ".unreq vC31\n" ".unreq vC32\n" ".unreq vC33\n" ".unreq vC34\n"
        ".unreq qC21\n" ".unreq qC22\n" ".unreq qC23\n" ".unreq qC24\n"
        ".unreq vC21\n" ".unreq vC22\n" ".unreq vC23\n" ".unreq vC24\n"
        ".unreq qC11\n" ".unreq qC12\n" ".unreq qC13\n" ".unreq qC14\n"
        ".unreq vC11\n" ".unreq vC12\n" ".unreq vC13\n" ".unreq vC14\n"
        ".unreq aptr2\n"
        ".unreq aptr3\n"
        ".unreq aptr4\n"

        : [aptr] "+r" (aptr),
          [bptr] "+r" (bptr),
          [cptr] "+r" (cptr),
          [k] "+r" (k)
        : [a_row_stride_bytes] "r" (a_row_stride * sizeof(float)),
          [b_row_stride_bytes] "r" (b_row_stride * sizeof(float)),
          [c_row_stride_bytes] "r" (c_row_stride * sizeof(float))
        : "cc", "memory", "x20", "x21", "x22",
          "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
          "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
          "v21", "v22", "v23"
      );
    }
  }
}

/**
 * Single-precision matrix multiply C = A * B, computed in 4x16 output tiles,
 * specialised for a K dimension that leaves a remainder of three after being
 * processed in groups of four.
 *
 * For every tile the sixteen accumulator registers are zeroed, (K - 3) / 4
 * groups of four K steps are accumulated, three further K steps are
 * accumulated in the common tail, and the tile is then stored to C (C is
 * overwritten, not accumulated into).
 *
 * @param a             Pointer to the first element of A.
 * @param b             Pointer to the first element of B.
 * @param c             Pointer to the first element of C.
 * @param M             Number of output rows; rounded up to blocks of 4.
 * @param K             Depth of the product.
 * @param N             Number of output columns; rounded up to blocks of 16.
 * @param a_row_stride  Distance between rows of A, in floats.
 * @param b_row_stride  Distance between rows of B, in floats.
 * @param c_row_stride  Distance between rows of C, in floats.
 *
 * NOTE(review): a full 4x16 tile is always loaded and stored, with no bounds
 * checks, and the tail always consumes three K steps. Presumably callers pad
 * the matrices accordingly and only select this specialisation when
 * K % 4 == 3 -- confirm against the dispatching code.
 * NOTE(review): iceildiv is defined elsewhere; presumably a ceiling division.
 */
template <>
inline void sgemm_4x16_impl<3>(
  const float* const a, const float* const b, float *c,
  const int M, const int K, const int N,
  const int a_row_stride,
  const int b_row_stride,
  const int c_row_stride
) {
  const int TAIL_SIZE = 3;
  const int M_BLOCK = 4;
  const int N_BLOCK = 16;

  const int m_blocks = iceildiv(M, M_BLOCK);
  const int n_blocks = iceildiv(N, N_BLOCK);

  // For each block of output rows
  for (int mblock = 0; mblock < m_blocks; mblock++) {
    // For each block of output columns
    for (int nblock = 0; nblock < n_blocks; nblock++) {
      const float *aptr = a + mblock*M_BLOCK*a_row_stride;
      const float *bptr = b + nblock*N_BLOCK;
      float *cptr = c + mblock*M_BLOCK*c_row_stride + nblock*N_BLOCK;
      // Number of groups of four K steps. Held in a 64-bit variable because
      // the assembly below tests and decrements it through its 64-bit register
      // name (%x[k]); with a 32-bit operand the upper half of that register
      // would not be guaranteed to be defined.
      long long k = (K - TAIL_SIZE) / 4;

      asm volatile(
        // Register aliases: aptr2..4 address rows 2..4 of the A block,
        // C<row><col> are the 4x4 grid of accumulator vectors, A<row> holds
        // values from one row of A and B<col> one vector of a row of B.
        "aptr2 .req X20\n"
        "aptr3 .req X21\n"
        "aptr4 .req X22\n"
        "vC11 .req  v0\n" "vC12 .req  v1\n" "vC13 .req  v2\n" "vC14 .req  v3\n"
        "qC11 .req  q0\n" "qC12 .req  q1\n" "qC13 .req  q2\n" "qC14 .req  q3\n"
        "vC21 .req  v4\n" "vC22 .req  v5\n" "vC23 .req  v6\n" "vC24 .req  v7\n"
        "qC21 .req  q4\n" "qC22 .req  q5\n" "qC23 .req  q6\n" "qC24 .req  q7\n"
        "vC31 .req  v8\n" "vC32 .req  v9\n" "vC33 .req v10\n" "vC34 .req v11\n"
        "qC31 .req  q8\n" "qC32 .req  q9\n" "qC33 .req q10\n" "qC34 .req q11\n"
        "vC41 .req v12\n" "vC42 .req v13\n" "vC43 .req v14\n" "vC44 .req v15\n"
        "qC41 .req q12\n" "qC42 .req q13\n" "qC43 .req q14\n" "qC44 .req q15\n"
        "vA1 .req v16\n" "qA1 .req q16\n" "dA1 .req d16\n" "sA1 .req s16\n"
        "vA2 .req v17\n" "qA2 .req q17\n" "dA2 .req d17\n" "sA2 .req s17\n"
        "vA3 .req v18\n" "qA3 .req q18\n" "dA3 .req d18\n" "sA3 .req s18\n"
        "vA4 .req v19\n" "qA4 .req q19\n" "dA4 .req d19\n" "sA4 .req s19\n"
        "vB1 .req v20\n" "qB1 .req q20\n"
        "vB2 .req v21\n" "qB2 .req q21\n"
        "vB3 .req v22\n" "qB3 .req q22\n"
        "vB4 .req v23\n" "qB4 .req q23\n"

        // Clear accumulators, initialise pointers
        "movi vC11.4s, #0\n"
        "ldr qB1, [%x[bptr], #0x00]\n"
        "movi vC12.4s, #0\n"
        "ldr qB2, [%x[bptr], #0x10]\n"
        "movi vC13.4s, #0\n"
        "ldr qB3, [%x[bptr], #0x20]\n"
        "movi vC14.4s, #0\n"
        "add aptr2, %x[aptr], %x[a_row_stride_bytes]\n"
        "movi vC21.4s, #0\n"
        "add aptr3,    aptr2, %x[a_row_stride_bytes]\n"
        "movi vC22.4s, #0\n"
        "add aptr4,    aptr3, %x[a_row_stride_bytes]\n"
        "movi vC23.4s, #0\n"
        "cbnz %x[k], 3f\n"

        // Prepare for tail in K
        // (k == 0: only the tail values of each A row are needed; the first
        // two of them are fetched with 64-bit loads for rows 1 and 2.)
        "movi vC24.4s, #0\n"
        "ldr dA1, [%x[aptr]], #0x08\n"
        "movi vC31.4s, #0\n"
        "ldr dA2, [   aptr2], #0x08\n"
        "movi vC32.4s, #0\n"
        "movi vC33.4s, #0\n"
        "movi vC34.4s, #0\n"
        "movi vC41.4s, #0\n"
        "movi vC42.4s, #0\n"
        "movi vC43.4s, #0\n"
        "movi vC44.4s, #0\n"
        "b 2f\n"  // Jump to tail

        "3:"  // Prepare for loop over K
          "movi vC24.4s, #0\n"
          "ldr qA1, [%x[aptr]], #0x10\n"
          "movi vC31.4s, #0\n"
          "ldr qA2, [   aptr2], #0x10\n"
          "movi vC32.4s, #0\n"
          "movi vC33.4s, #0\n"
          "movi vC34.4s, #0\n"
          "movi vC41.4s, #0\n"
          "movi vC42.4s, #0\n"
          "movi vC43.4s, #0\n"
          "movi vC44.4s, #0\n"
          // The last group of four is handled at label 4, so the loop proper
          // runs k - 1 times.
          "subs %x[k], %x[k], #1\n"
          "beq 4f\n"

        "1:"  // Loop proper
          // K step using lane 0 of each A register
          "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
          "ldr qA3, [   aptr3], #0x10\n"
          "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
          "ldr qA4, [   aptr4], #0x10\n"
          "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
          "ldr qB1, [%x[bptr], #0x00]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
          "ldr qB2, [%x[bptr], #0x10]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
          "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
          "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
          "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
          "ldr qB3, [%x[bptr], #0x20]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[0]\n"

          // K step using lane 1
          "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
          "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
          "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
          "ldr qB1, [%x[bptr], #0x00]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
          "ldr qB2, [%x[bptr], #0x10]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
          "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
          "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
          "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
          "ldr qB3, [%x[bptr], #0x20]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[1]\n"

          // K step using lane 2
          "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
          "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
          "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
          "ldr qB1, [%x[bptr], #0x00]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
          "ldr qB2, [%x[bptr], #0x10]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
          "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
          "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
          "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
          "ldr qB3, [%x[bptr], #0x20]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[2]\n"

          // K step using lane 3; also reloads A rows 1 and 2 for the next
          // group of four and updates the loop counter.
          "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
          "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
          "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
          "ldr qB1, [%x[bptr], #0x00]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
          "ldr qB2, [%x[bptr], #0x10]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
          "subs %x[k], %x[k], #1\n"
          "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
          "ldr qA1, [%x[aptr]], #0x10\n"
          "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
          "ldr qA2, [   aptr2], #0x10\n"
          "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
          "ldr qB3, [%x[bptr], #0x20]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[3]\n"
          "bne 1b\n"

        "4:"  // Tail iteration
          // Final group of four K steps. Identical to the loop body except
          // that the A reloads at the end are 64-bit loads feeding the common
          // tail.
          "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
          "ldr qA3, [   aptr3], #0x10\n"
          "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
          "ldr qA4, [   aptr4], #0x10\n"
          "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
          "ldr qB1, [%x[bptr], #0x00]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
          "ldr qB2, [%x[bptr], #0x10]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
          "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
          "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
          "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
          "ldr qB3, [%x[bptr], #0x20]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[0]\n"

          "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
          "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
          "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
          "ldr qB1, [%x[bptr], #0x00]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
          "ldr qB2, [%x[bptr], #0x10]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
          "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
          "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
          "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
          "ldr qB3, [%x[bptr], #0x20]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[1]\n"

          "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
          "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
          "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
          "ldr qB1, [%x[bptr], #0x00]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
          "ldr qB2, [%x[bptr], #0x10]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
          "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
          "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
          "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
          "ldr qB3, [%x[bptr], #0x20]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[2]\n"

          "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
          "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
          "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
          "ldr qB1, [%x[bptr], #0x00]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
          "ldr qB2, [%x[bptr], #0x10]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
          "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
          "ldr dA1, [%x[aptr]], #0x08\n"
          "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
          "ldr dA2, [   aptr2], #0x08\n"
          "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
          "ldr qB3, [%x[bptr], #0x20]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[3]\n"

        "2:"  // Common tail
          // First of the three remaining K steps (lane 0 of the 64-bit loads)
          "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
          "ldr dA3, [   aptr3], #0x08\n"
          "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
          "ldr dA4, [   aptr4], #0x08\n"
          "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
          "ldr qB1, [%x[bptr], #0x00]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
          "ldr qB2, [%x[bptr], #0x10]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
          "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
          "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
          "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
          "ldr qB3, [%x[bptr], #0x20]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[0]\n"

          // Second remaining K step (lane 1). Once a row's lane 1 has been
          // used for the last time, its final A value is fetched with a
          // 32-bit load into the same register.
          "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
          "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
          "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
          "ldr qB1, [%x[bptr], #0x00]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
          "ldr qB2, [%x[bptr], #0x10]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
          "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
          "ldr sA1, [%x[aptr]], #0x04\n"
          "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
          "ldr sA2, [   aptr2], #0x04\n"
          "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
          "ldr qB3, [%x[bptr], #0x20]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[1]\n"

          // Last K step (lane 0 of the 32-bit loads), ordered row by row so
          // that each output row is stored as soon as its four accumulators
          // are complete.
          "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
          "ldr qB4, [%x[bptr], #0x30]\n"
          "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
          "stp qC11, qC12, [%x[cptr], #0x00]\n"
          "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
          "ldr sA3, [   aptr3], #0x04\n"
          "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
          "stp qC13, qC14, [%x[cptr], #0x20]\n"
          "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
          "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
          "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
          "stp qC21, qC22, [%x[cptr], #0x00]\n"
          "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
          "ldr sA4, [   aptr4], #0x04\n"
          "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
          "stp qC23, qC24, [%x[cptr], #0x20]\n"
          "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
          "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
          "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
          "stp qC31, qC32, [%x[cptr], #0x00]\n"
          "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
          "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
          "stp qC33, qC34, [%x[cptr], #0x20]\n"
          "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
          "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
          "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
          "stp qC41, qC42, [%x[cptr], #0x00]\n"
          "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
          "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
          "stp qC43, qC44, [%x[cptr], #0x20]\n"
          "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"

        // Release every alias so the names can be defined again by the next
        // asm statement.
        ".unreq vB4\n" ".unreq qB4\n"
        ".unreq vB3\n" ".unreq qB3\n"
        ".unreq vB2\n" ".unreq qB2\n"
        ".unreq vB1\n" ".unreq qB1\n"
        ".unreq vA4\n" ".unreq qA4\n" ".unreq dA4\n" ".unreq sA4\n"
        ".unreq vA3\n" ".unreq qA3\n" ".unreq dA3\n" ".unreq sA3\n"
        ".unreq vA2\n" ".unreq qA2\n" ".unreq dA2\n" ".unreq sA2\n"
        ".unreq vA1\n" ".unreq qA1\n" ".unreq dA1\n" ".unreq sA1\n"
        ".unreq qC41\n" ".unreq qC42\n" ".unreq qC43\n" ".unreq qC44\n"
        ".unreq vC41\n" ".unreq vC42\n" ".unreq vC43\n" ".unreq vC44\n"
        ".unreq qC31\n" ".unreq qC32\n" ".unreq qC33\n" ".unreq qC34\n"
        ".unreq vC31\n" ".unreq vC32\n" ".unreq vC33\n" ".unreq vC34\n"
        ".unreq qC21\n" ".unreq qC22\n" ".unreq qC23\n" ".unreq qC24\n"
        ".unreq vC21\n" ".unreq vC22\n" ".unreq vC23\n" ".unreq vC24\n"
        ".unreq qC11\n" ".unreq qC12\n" ".unreq qC13\n" ".unreq qC14\n"
        ".unreq vC11\n" ".unreq vC12\n" ".unreq vC13\n" ".unreq vC14\n"
        ".unreq aptr2\n"
        ".unreq aptr3\n"
        ".unreq aptr4\n"

        : [aptr] "+r" (aptr),
          [bptr] "+r" (bptr),
          [cptr] "+r" (cptr),
          [k] "+r" (k)
        : [a_row_stride_bytes] "r" (a_row_stride * sizeof(float)),
          [b_row_stride_bytes] "r" (b_row_stride * sizeof(float)),
          [c_row_stride_bytes] "r" (c_row_stride * sizeof(float))
        : "cc", "memory", "x20", "x21", "x22",
          "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
          "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
          "v21", "v22", "v23"
      );
    }
  }
}
